# erin baggott carter
# october 15, 2019
# this file constructs an event level dataset from TABARI output, and saves it as "AmericanDiplomacyDataset.csv"
# it then constructs a month level dataset with covariates and saves it as "FINALDATA.csv"

library(events)
library(plyr)
library(countrycode)
library(foreign)

setwd('~/Dropbox/Diversionary Aggression Paper/II Final Submission/Replication Files')

################################################################

# load event data from TABARI text output file
dat <- read_eventdata('globalDiplomaticEvents.txt', sep='\t', col.format="DSTCLQ", one.a.day=T, date.format="%Y%m%d")

dat$topic = as.numeric(substr(dat$quote,6,6)) # 1 = polmil, 2 = economic
nrow(dat) # 132,429

# collapse all entities into national governments -- ignoring LEG, EXEC, MIL, etc
dat$source = substr(dat$source,1,3)
dat$target = substr(dat$target,1,3)

# restrict to events involving US
dat <- dat[dat$source=="USA" | dat$target=="USA",] 
dat <- dat[(dat$source=="USA" & dat$target=="USA")==F,] # no US-US aggression

# limit to 1946-2010
dat <- dat[dat$date>"1946-01-01",]
nrow(dat) 

################################################################

# apply cameo scores (the ones that appear in this dataset)

sort(unique(dat$code))
dat$scale[dat$code=="14O"] <- -6.5  # fix one botched code here before numericizing
dat$code <- as.numeric(as.vector(dat$code))
dat$scale[dat$code==010] <- 0
dat$scale[dat$code==011] <- -.1
dat$scale[dat$code==012] <- -.4
dat$scale[dat$code==013] <- .4
dat$scale[dat$code==014] <- 0
dat$scale[dat$code==015] <- 0
dat$scale[dat$code==016] <- 3.4
dat$scale[dat$code==017] <- 0
dat$scale[dat$code==018] <- 3.4
dat$scale[dat$code==019] <- 3.4#
dat$scale[dat$code==020] <- 3
dat$scale[dat$code==021] <- 3.4
dat$scale[dat$code==0211] <- 3.4
dat$scale[dat$code==0213] <- 3.4#
dat$scale[dat$code==0214] <- 3.4#
dat$scale[dat$code==022] <- 3.4
dat$scale[dat$code==023] <- 3.4
dat$scale[dat$code==0231] <- 3.4
dat$scale[dat$code==0232] <- 3.4
dat$scale[dat$code==0233] <- 3.4
dat$scale[dat$code==0234] <- 3.4
dat$scale[dat$code==024] <- -.3
dat$scale[dat$code==0241] <- -.3
dat$scale[dat$code==0242] <- -.3
dat$scale[dat$code==0243] <- -.3
dat$scale[dat$code==0244] <- -.3
dat$scale[dat$code==025] <- -.3
dat$scale[dat$code==0253] <- -.3#
dat$scale[dat$code==0254] <- -.3#
dat$scale[dat$code==0256] <- -.3#
dat$scale[dat$code==026] <- 4
dat$scale[dat$code==027] <- 4
dat$scale[dat$code==028] <- 4
dat$scale[dat$code==030] <- 4
dat$scale[dat$code==031] <- 5.2
dat$scale[dat$code==0311] <- 5.2
dat$scale[dat$code==0312] <- 5.2
dat$scale[dat$code==032] <- 4.5
dat$scale[dat$code==033] <- 5.2
dat$scale[dat$code==0331] <- 5.2
dat$scale[dat$code==0332] <- 5.2
dat$scale[dat$code==0333] <- 5.2
dat$scale[dat$code==0334] <- 6
dat$scale[dat$code==034] <- 7
dat$scale[dat$code==0341] <- 7
dat$scale[dat$code==0344] <- 7
dat$scale[dat$code==035] <- 7
dat$scale[dat$code==0351] <- 7
dat$scale[dat$code==0353] <- 7
dat$scale[dat$code==0355] <- 7
dat$scale[dat$code==0356] <- 7
dat$scale[dat$code==036] <- 4
dat$scale[dat$code==037] <- 5
dat$scale[dat$code==038] <- 7
dat$scale[dat$code==039] <- 5
dat$scale[dat$code==040] <- 1
dat$scale[dat$code==041] <- 1
dat$scale[dat$code==042] <- 1.9
dat$scale[dat$code==043] <- 2.8
dat$scale[dat$code==044] <- 2.5
dat$scale[dat$code==045] <- 5
dat$scale[dat$code==046] <- 7
dat$scale[dat$code==050] <- 3.5
dat$scale[dat$code==051] <- 3.4
dat$scale[dat$code==052] <- 3.5
dat$scale[dat$code==053] <- 3.8
dat$scale[dat$code==054] <- 6
dat$scale[dat$code==055] <- 7
dat$scale[dat$code==056] <- 7
dat$scale[dat$code==057] <- 8
dat$scale[dat$code==060] <- 6
dat$scale[dat$code==061] <- 6.4
dat$scale[dat$code==062] <- 7.4
dat$scale[dat$code==063] <- 7.4
dat$scale[dat$code==064] <- 7
dat$scale[dat$code==070] <- 7
dat$scale[dat$code==071] <- 7.4
dat$scale[dat$code==072] <- 8.3
dat$scale[dat$code==073] <- 7.4
dat$scale[dat$code==074] <- 8.5
dat$scale[dat$code==075] <- 7
dat$scale[dat$code==080] <- 5
dat$scale[dat$code==081] <- 5
dat$scale[dat$code==0811] <- 5
dat$scale[dat$code==0812] <- 5
dat$scale[dat$code==0813] <- 5
dat$scale[dat$code==0814] <- 5
dat$scale[dat$code==082] <- 5
dat$scale[dat$code==083] <- 5
dat$scale[dat$code==0831] <- 5
dat$scale[dat$code==0833] <- 5
dat$scale[dat$code==0834] <- 5
dat$scale[dat$code==084] <- 7
dat$scale[dat$code==0841] <- 7
dat$scale[dat$code==0842] <- 7
dat$scale[dat$code==085] <- 7
dat$scale[dat$code==086] <- 9
dat$scale[dat$code==0861] <- 9
dat$scale[dat$code==0862] <- 9
dat$scale[dat$code==0863] <- 9
dat$scale[dat$code==087] <- 9
dat$scale[dat$code==0871] <- 9
dat$scale[dat$code==0872] <- 9
dat$scale[dat$code==0873] <- 9
dat$scale[dat$code==0874] <- 10
dat$scale[dat$code==090] <- -2
dat$scale[dat$code==091] <- -2
dat$scale[dat$code==092] <- -2
dat$scale[dat$code==093] <- -2
dat$scale[dat$code==094] <- -2
dat$scale[dat$code==100] <- -5
dat$scale[dat$code==101] <- -5
dat$scale[dat$code==1013] <- -5#
dat$scale[dat$code==1014] <- -5#
dat$scale[dat$code==102] <- -5
dat$scale[dat$code==1031] <- -5#
dat$scale[dat$code==1033] <- -5#
dat$scale[dat$code==1041] <- -5
dat$scale[dat$code==1042] <- -5
dat$scale[dat$code==1043] <- -5
dat$scale[dat$code==1044] <- -5
dat$scale[dat$code==105] <- -5
dat$scale[dat$code==1053] <- -5#
dat$scale[dat$code==1056] <- -5#
dat$scale[dat$code==106] <- -5
dat$scale[dat$code==107] <- -5
dat$scale[dat$code==109] <- -5#
dat$scale[dat$code==110] <- -2
dat$scale[dat$code==111] <- -2
dat$scale[dat$code==112] <- -2
dat$scale[dat$code==1121] <- -2
dat$scale[dat$code==1122] <- -2
dat$scale[dat$code==1123] <- -2
dat$scale[dat$code==1124] <- -2
dat$scale[dat$code==1125] <- -2
dat$scale[dat$code==113] <- -2
dat$scale[dat$code==114] <- -2
dat$scale[dat$code==115] <- -2
dat$scale[dat$code==116] <- -2#
dat$scale[dat$code==120] <- -4
dat$scale[dat$code==121] <- -4
dat$scale[dat$code==1211] <- -4
dat$scale[dat$code==1212] <- -4
dat$scale[dat$code==1213] <- -4
dat$scale[dat$code==122] <- -4
dat$scale[dat$code==1222] <- -4#
dat$scale[dat$code==123] <- -4
dat$scale[dat$code==1231] <- -4
dat$scale[dat$code==1233] <- -4
dat$scale[dat$code==124] <- -5
dat$scale[dat$code==1241] <- -5#
dat$scale[dat$code==1243] <- -5#
dat$scale[dat$code==1244] <- -5#
dat$scale[dat$code==1246] <- -5#
dat$scale[dat$code==125] <- -5
dat$scale[dat$code==127] <- -5
dat$scale[dat$code==128] <- -5
dat$scale[dat$code==129] <- -5#
dat$scale[dat$code==130] <- -4.4
dat$scale[dat$code==131] <- -5.8
dat$scale[dat$code==1311] <- -5.8
dat$scale[dat$code==1312] <- -5.8
dat$scale[dat$code==1313] <- -5.8
dat$scale[dat$code==132] <- -5.8
dat$scale[dat$code==1322] <- -5.8
dat$scale[dat$code==133] <- -5.8
dat$scale[dat$code==134] <- -5.8
dat$scale[dat$code==136] <- -7
dat$scale[dat$code==137] <- -7
dat$scale[dat$code==138] <- -7
dat$scale[dat$code==1383] <- -7
dat$scale[dat$code==1384] <- -7
dat$scale[dat$code==139] <- -7
dat$scale[dat$code==140] <- -6.5
dat$scale[dat$code==141] <- -6.5
dat$scale[dat$code==1411] <- -6.5
dat$scale[dat$code==1412] <- -6.5
dat$scale[dat$code==1413] <- -6.5
dat$scale[dat$code==1414] <- -6.5
dat$scale[dat$code==142] <- -6.5
dat$scale[dat$code==143] <- -6.5
dat$scale[dat$code==144] <- -7.5
dat$scale[dat$code==145] <- -7.5
dat$scale[dat$code==150] <- -7.2
dat$scale[dat$code==151] <- -7.2
dat$scale[dat$code==152] <- -7.2
dat$scale[dat$code==153] <- -7.2
dat$scale[dat$code==154] <- -7.2
dat$scale[dat$code==160] <- -4
dat$scale[dat$code==161] <- -4
dat$scale[dat$code==162] <- -5.6
dat$scale[dat$code==1621] <- -5.6
dat$scale[dat$code==1622] <- -5.6
dat$scale[dat$code==163] <- -6.5
dat$scale[dat$code==164] <- -7
dat$scale[dat$code==166] <- -8
dat$scale[dat$code==1662] <- -8#
dat$scale[dat$code==170] <- -7
dat$scale[dat$code==171] <- -9.2
dat$scale[dat$code==1711] <- -9.2
dat$scale[dat$code==1712] <- -9.2
dat$scale[dat$code==172] <- -5
dat$scale[dat$code==1721] <- -5
dat$scale[dat$code==1722] <- -5
dat$scale[dat$code==1723] <- -5
dat$scale[dat$code==1724] <- -5
dat$scale[dat$code==173] <- -5
dat$scale[dat$code==174] <- -5
dat$scale[dat$code==175] <- -9
dat$scale[dat$code==180] <- -9
dat$scale[dat$code==181] <- -9
dat$scale[dat$code==182] <- -9.5
dat$scale[dat$code==1821] <- -9
dat$scale[dat$code==1822] <- -9
dat$scale[dat$code==1823] <- -10
dat$scale[dat$code==183] <- -10
dat$scale[dat$code==1831] <- -10
dat$scale[dat$code==1832] <- -10
dat$scale[dat$code==185] <- -8
dat$scale[dat$code==186] <- -10
dat$scale[dat$code==190] <- -10
dat$scale[dat$code==191] <- -9.5
dat$scale[dat$code==192] <- -9.5
dat$scale[dat$code==193] <- -10
dat$scale[dat$code==194] <- -10
dat$scale[dat$code==195] <- -10
dat$scale[dat$code==196] <- -9.5
dat$scale[dat$code==201] <- -9.5
dat$scale[dat$code==202] <- -10
dat$scale[dat$code==203] <- -10

sort(unique(dat$code)) # no NAs

################################################################

# assign verbal and material cooperation labels

dat$vcoop <- 0
dat$mcoop <- 0
dat$vcon <- 0
dat$mcon <- 0
dat$vcoop[dat$code >=1 & dat$code <=5] <- 1
dat$mcoop[dat$code >=6 & dat$code <=9] <- 1
dat$vcoop[dat$code >=10 & dat$code <=59] <- 1
dat$mcoop[dat$code >=60 & dat$code <=99] <- 1
dat$vcon[dat$code >=100 & dat$code <= 142] <- 1
dat$mcon[dat$code >=143 & dat$code <=204] <- 1
dat$vcoop[dat$code >=211 & dat$code <=356] <- 1
dat$mcoop[dat$code >=811 & dat$code <= 874] <- 1
dat$vcon[dat$code >= 1011 & dat$code <=1424] <- 1
dat$mcon[dat$code >=1431 & dat$code <=2042] <- 1

# exclude all threats
dat$threat = 0
dat$threat[dat$code==13] = 1
dat$threat[dat$code >= 130 & dat$ccode <= 139] = 1
dat$threat[dat$code >= 1311 & dat$ccode <= 1385] = 1
dat$vcon[dat$threat==1] = 0

# exclude threats of war only
#dat$threat = 0
#warthreats = c(137,138,1381,1382,1383,1384,1385,139)
#dat$threat[dat$code %in% warthreats] = 1
#dat$vcon[dat$threat==1] = 0

#13:[-6.0] THREATEN
#130:[-4.4] Threaten, not specified below
#131:[-5.8] Threaten non-force, not specified below
#  1311:[-5.8] Threaten to reduce or stop aid
#  1312:[-5.8] Threaten to boycott, embargo, or sanction
#  1313:[-5.8] Threaten to reduce or break relations
#132:[-5.8] Threaten with administrative sanctions, not specified below
#  1321:[-5.8] Threaten to impose restrictions on freedoms of speech and expression 
#  1322:[-5.8] Threaten to ban political parties or politicians
#  1323:[-5.8] Threaten to impose curfew
#  1324:[-5.8] Threaten to impose state of emergency or martial law
#133:[-5.8] Threaten collective dissent
#134:[-5.8] Threaten to halt negotiations
#135:[-5.8] Threaten to halt mediation
#136:[-7.0] Threaten to expel or withdraw peacekeepers
#137:[-7.0] Threaten with violent repression
#138:[-7.0] Threaten to use military force, not specified below
#  1381:[-7.0] Threaten blockade
#  1382:[-7.0] Threaten occupation 
#  1383:[-7.0] Threaten unconventional violence 
#  1384:[-7.0] Threaten conventional attack  
#  1385:[-7.0] Threaten attack with WMD
#139:[-7.0] Give ultimatum

# note economic verbal conflict -- plausibly NOT diversion

dat$vconeconomic = 0
dat$vconeconomic[dat$code==103 | dat$code==102 | dat$code==1042 | dat$code==115] = 1

#0242:[-0.3] Appeal for policy change
#103:[-5.0] Demand aid, protection, or peacekeeping
#102:[-5.0] Demand policy support
#  1042:[-5.0] Demand policy change
#115:[-2.0] Bring lawsuit against 

# note economic material conflict 
#162:[-5.6] Reduce or stop aid, not specified below
#  1621:[-5.6] Reduce or stop economic assistance
#163:[-6.5] Halt negotiations
#  1643:[-7.0] Expel or withdraw aid agencies
#166:[-8.0] Impose embargo, boycott, or sanctions

dat$mconeconomic = 0
dat$mconeconomic[dat$code==162 | dat$code==1621 | dat$code==1643 | dat$code==166] = 1

################################################################

# save 
write.csv(dat,file="AmericanDiplomacyDataset.csv")

# restrict to US initiated events
dat <- dat[dat$source=="USA",] 
write.csv(dat,file="AmericanDiplomacyDatasetUSinitiated.csv")

################################################################

# set up net conflict, cooperation

c = as.data.frame(sort(unique(dat$target)))
colnames(c) = "country"
head(c)

for(i in 1:nrow(c)){
  c$sum_vcon[i] = sum(dat$vcon[dat$target==c$country[i]])
  c$sum_vcoop[i] = sum(dat$vcoop[dat$target==c$country[i]])
  c$sum_mcon[i] = sum(dat$mcon[dat$target==c$country[i]])
  c$sum_mcoop[i] = sum(dat$mcoop[dat$target==c$country[i]])
  # new pol
  c$sum_vcon_pol[i] = sum(dat$vcon[dat$target==c$country[i] & dat$topic[i]==1])
  c$sum_vcoop_pol[i] = sum(dat$vcoop[dat$target==c$country[i] & dat$topic[i]==1])
  c$sum_mcon_pol[i] = sum(dat$mcon[dat$target==c$country[i] & dat$topic[i]==1])
  c$sum_mcoop_pol[i] = sum(dat$mcoop[dat$target==c$country[i] & dat$topic[i]==1])	
  # new econ
  c$sum_vcon_econ[i] = sum(dat$vcon[dat$target==c$country[i] & dat$topic[i]==2])
  c$sum_vcoop_econ[i] = sum(dat$vcoop[dat$target==c$country[i] & dat$topic[i]==2])
  c$sum_mcon_econ[i] = sum(dat$mcon[dat$target==c$country[i] & dat$topic[i]==2])
  c$sum_mcoop_econ[i] = sum(dat$mcoop[dat$target==c$country[i] & dat$topic[i]==2])	
}

c$net_mcon = c$sum_mcon - c$sum_mcoop
c$net_vcon = c$sum_vcon - c$sum_vcoop
c$net_vcoop = c$sum_vcoop - c$sum_vcon
c$net_mcoop = c$sum_mcoop - c$sum_mcon
# new pol
c$net_mcon_pol = c$sum_mcon_pol - c$sum_mcoop_pol
c$net_vcon_pol = c$sum_vcon_pol - c$sum_vcoop_pol
c$net_vcoop_pol = c$sum_vcoop_pol - c$sum_vcon_pol
c$net_mcoop_pol = c$sum_mcoop_pol - c$sum_mcon_pol
# new econ
c$net_mcon_econ = c$sum_mcon_econ - c$sum_mcoop_econ
c$net_vcon_econ = c$sum_vcon_econ - c$sum_vcoop_econ
c$net_vcoop_econ = c$sum_vcoop_econ - c$sum_vcon_econ
c$net_mcoop_econ = c$sum_mcoop_econ - c$sum_mcon_econ

################################################################

# merge in alliance data (tabari codes are iso3c)

a = read.csv('alliance_v4.1_by_directed_yearly.csv',sep='\t')
a = a[a$state_name1=="United States of America",]
a$target = countrycode(a$ccode2, "cown", "iso3c")
a = a[c("year","target")]
a$alliance = 1

dat$year = as.numeric(substr(dat$date,1,4))

dat$ally = 0
for(i in 1:nrow(dat)){
  if(dat$target[i]==a$target & dat$year[i]==a$year){
    dat$ally[i] = 1
  }
}

################################################################

# move to df (unique dates)

df <- as.data.frame(as.Date(as.Date("1946-01-01"):as.Date("2011-01-01"), origin="1970-01-01"))
colnames(df) <- "date"

coreallyset = c('GBR','AUS','CAN','NZL')
gpset = c('GBR','CHN','FRA','DEU','JPN','RUS')
enemyset = c('CHN','PRK','JPN','RUS','DEU','VMN','IRQ','IRN','AFG','CUB')

for(i in 1:nrow(df)){
  if((i %% 1000==0)==T){print(paste('On',i,'of',nrow(df),'rows,',round((i/nrow(df))*100,2),'percent'))}
  df$scale[i] <- mean(dat$scale[df$date[i]==dat$date]) 	# daily mean scale
  df$vcoop[i] <- sum(dat$vcoop[df$date[i]==dat$date])		# daily vcoop count
  df$mcoop[i] <- sum(dat$mcoop[df$date[i]==dat$date])		# daily mcoop count
  df$vcon[i] <- sum(dat$vcon[df$date[i]==dat$date])		# daily vcon count
  df$mcon[i] <- sum(dat$mcon[df$date[i]==dat$date])		# daily mcon count
  df$threat[i] <- sum(dat$threat[df$date[i]==dat$date])	# daily threat count
  df$vconeconomic[i] <- sum(dat$vconeconomic[df$date[i]==dat$date])	# daily vconeconomic count
  df$mconeconomic[i] <- sum(dat$mconeconomic[df$date[i]==dat$date])	# daily vconeconomic count
  #
  # verbal conflict towards allies, great powers, traditional enemies, core allies:
  #
  df$vconAlly[i] <- sum(dat$vcon[df$date[i]==dat$date & dat$alliance==1])
  df$vconCore[i] <- sum(dat$vcon[df$date[i]==dat$date & dat$target %in% coreallyset])
  df$vconGreat[i] <- sum(dat$vcon[df$date[i]==dat$date & dat$target %in% gpset]) 
  df$vconEnemy[i] <- sum(dat$vcon[df$date[i]==dat$date & dat$target %in% enemyset])
  #
  # verbal cooperation towards allies, great powers, traditional enemies, core allies:
  #
  df$vcoopAlly[i] <- sum(dat$vcoop[df$date[i]==dat$date & dat$alliance==1])
  df$vcoopCore[i] <- sum(dat$vcoop[df$date[i]==dat$date & dat$target %in% coreallyset])
  df$vcoopGreat[i] <- sum(dat$vcoop[df$date[i]==dat$date & dat$target %in% gpset]) 
  df$vcoopEnemy[i] <- sum(dat$vcoop[df$date[i]==dat$date & dat$target %in% enemyset])
  #
  # material conflict towards allies, great powers, traditional enemies, core allies:
  #
  df$mconAlly[i] <- sum(dat$mcon[df$date[i]==dat$date & dat$alliance==1])
  df$mconCore[i] <- sum(dat$mcon[df$date[i]==dat$date & dat$target %in% coreallyset])
  df$mconGreat[i] <- sum(dat$mcon[df$date[i]==dat$date & dat$target %in% gpset]) 
  df$mconEnemy[i] <- sum(dat$mcon[df$date[i]==dat$date & dat$target %in% enemyset])
  #
  # material cooperation towards allies, great powers, traditional enemies, core allies:
  #
  df$mcoopAlly[i] <- sum(dat$mcoop[df$date[i]==dat$date & dat$alliance==1])
  df$mcoopCore[i] <- sum(dat$mcoop[df$date[i]==dat$date & dat$target %in% coreallyset])
  df$mcoopGreat[i] <- sum(dat$mcoop[df$date[i]==dat$date & dat$target %in% gpset]) 
  df$mcoopEnemy[i] <- sum(dat$mcoop[df$date[i]==dat$date & dat$target %in% enemyset])
  #
  # topic of conflict
  #
  df$vconPol[i] <- sum(dat$vcon[df$date[i]==dat$date & dat$topic==1]) # 1 = pm, 2 = econ
  df$vconEcon[i] <- sum(dat$vcon[df$date[i]==dat$date & dat$topic==2]) 
  df$mconPol[i] <- sum(dat$mcon[df$date[i]==dat$date & dat$topic==1]) 
  df$mconEcon[i] <- sum(dat$mcon[df$date[i]==dat$date & dat$topic==2]) 
  #
  df$vcoopPol[i] <- sum(dat$vcoop[df$date[i]==dat$date & dat$topic==1]) # 1 = pm, 2 = econ
  df$vcoopEcon[i] <- sum(dat$vcoop[df$date[i]==dat$date & dat$topic==2]) 
  df$mcoopPol[i] <- sum(dat$mcon[df$date[i]==dat$date & dat$topic==1]) 
  df$mcoopEcon[i] <- sum(dat$mcon[df$date[i]==dat$date & dat$topic==2]) 
}


df$year = as.integer(substr(df$date,1,4))

# ADMINISTRATION (majority of year rule)
df$administration = NA
df$administration[df$year>=1945 & df$year<1953] = "Truman"
df$administration[df$year>=1953 & df$year<1961] = "Eisenhower"
df$administration[df$year>=1961 & df$year<1964] = "Kennedy" # JFK in office til 11/63, so gave him whole year
df$administration[df$year>=1964 & df$year<1969] = "Johnson"
df$administration[df$year>=1969 & df$year<1975] = "Nixon" # Nixon in office til 8/1974, so gave him whole year
df$administration[df$year>=1975 & df$year<1977] = "Ford"
df$administration[df$year>=1977 & df$year<1981] = "Carter"
df$administration[df$year>=1981 & df$year<1989] = "Reagan"
df$administration[df$year>=1989 & df$year<1993] = "Bush Sr"
df$administration[df$year>=1993 & df$year<2001] = "Clinton"
df$administration[df$year>=2001 & df$year<2009] = "Bush Jr"
df$administration[df$year>=2009] = "Obama"

# PRESIDENTIAL APPROVAL

app = read.table('approvalByParty.txt',sep='\t',header=T)
app$month = as.character(as.vector(app$month))
app$month[app$month=="Jan"] = "01"
app$month[app$month=="Feb"] = "02"
app$month[app$month=="Mar"] = "03"
app$month[app$month=="Apr"] = "04"
app$month[app$month=="May"] = "05"
app$month[app$month=="Jun"] = "06"
app$month[app$month=="Jul"] = "07"
app$month[app$month=="Aug"] = "08"
app$month[app$month=="Sep"] = "09"
app$month[app$month=="Oct"] = "10"
app$month[app$month=="Nov"] = "11"
app$month[app$month=="Dec"] = "12"
app$ym = paste(app$year,app$month,sep="-")
df$month = as.character(substr(df$date,6,7))
df$ym = paste(df$year,df$month,sep="-")


# there are occasionally multiple polls per month; this takes mean poll
df[c("demApprove","repApprove","indApprove")] = NA
for(i in 1:nrow(df)){
  if(df$ym[i] %in% app$ym){
    df$demApprove[i] = mean(app$demApprove[app$ym==df$ym[i]],na.rm=T)
    #if(is.na(df$demApprove[i])==F){print(app$demApprove[app$ym==df$ym[i]])}
    df$repApprove[i] =  mean(app$repApprove[app$ym==df$ym[i]],na.rm=T)
    df$indApprove[i] =  mean(app$indApprove[app$ym==df$ym[i]],na.rm=T)
  }
}


app = read.csv('approvalByParty2.csv',sep="\t")
app$year = substr(app$date,5,8)
app$month = substr(app$date,1,3)
app$month = as.character(as.vector(app$month))
app$month[app$month=="Jan"] = "01"
app$month[app$month=="Feb"] = "02"
app$month[app$month=="Mar"] = "03"
app$month[app$month=="Apr"] = "04"
app$month[app$month=="May"] = "05"
app$month[app$month=="Jun"] = "06"
app$month[app$month=="Jul"] = "07"
app$month[app$month=="Aug"] = "08"
app$month[app$month=="Sep"] = "09"
app$month[app$month=="Oct"] = "10"
app$month[app$month=="Nov"] = "11"
app$month[app$month=="Dec"] = "12"
app$ym = paste(app$year,app$month,sep="-")
app$party = as.character(as.vector(app$party))

# only single polls per month in this dataset
for(i in 1:nrow(df)){
  if(df$ym[i] %in% app$ym){
    df$demApprove[i] = app$approve[app$ym==df$ym[i] & app$party=="Democrat"]
    #if(is.na(df$demApprove[i])==F){print(app$approve[app$ym==df$ym[i] & app$party=="Democrat"])}
    df$repApprove[i] = app$approve[app$ym==df$ym[i] & app$party=="Republican"]
    df$indApprove[i] = app$approve[app$ym==df$ym[i] & app$party=="Independent/Other"]
  }
}


# ANES SWING 

dat <- read.dta("anes_cdf.dta");names(dat)
dat <- dat[c("VCF0004","VCF0201","VCF0202","VCF0218","VCF0224","VCF0301","VCF0428")]
colnames(dat) <- c("year","tdem0","trep0","tdem1","trep1","id","tpres")
head(dat)
# this has collapsed all "don't know" etc responses into NA

# mayer measure of swing vote: R party rating - D party rating
dat$gap0 = dat$trep0 - dat$tdem0
dat$gap1 = dat$trep1 - dat$tdem1

dat$swing = NA		
dat$swing[   dat$id== "2. Weak Democrat"| 
             dat$id== "3. Independent - Democrat"| 
             dat$id== "4. Independent - Independent"| 
             dat$id== "5. Independent - Republican"| 
             dat$id== "6. Weak Republican" 
           ] = 1

dat$swing[	 dat$id=="1. Strong Democrat" |
             dat$id=="7. Strong Republican" | 
             dat$id=="0. DK; NA; other; refused to answer; no Pre IW"
           ] = 0

dat$swingstrict = NA
dat$swingstrict[  dat$id=="3. Independent - Democrat" | 
                  dat$id=="4. Independent - Independent" | 
                  dat$id=="5. Independent - Republican"
                ] = 1

dat$swingstrict[  dat$id=="1. Strong Democrat"|
                  dat$id=="2. Weak Democrat"|
                  dat$id=="6. Weak Republican"|
                  dat$id=="7. Strong Republican"|
                  dat$id=="0. DK; NA; other; refused to answer; no Pre IW"
                ] = 0


# annual percent swing respondents

yearframe <- as.data.frame(c(1948:2008))
colnames(yearframe) <- "year"

for(i in 1:nrow(yearframe)){
  yearframe$swing[i] <- mean(na.omit(dat$swing[dat$year==yearframe$year[i]]))
  yearframe$swingstrict[i] <- mean(na.omit(dat$swingstrict[dat$year==yearframe$year[i]]))
}

# check trend in partisan identification
plot(yearframe$year, yearframe$swing)


# annual mean party thermometer gap

for(i in 1:nrow(yearframe)){
  yearframe$gap0[i] <- mean(dat$gap0[dat$year==yearframe$year[i]])
  yearframe$gap1[i] <- mean(dat$gap1[dat$year==yearframe$year[i]])
}

# see if early and late thermometer data consistent 
plot(yearframe$year,yearframe$gap0,col="red",ylim=c(-10,10))
points(yearframe$year,yearframe$gap1,col="navy") 

for(i in 1:nrow(yearframe)){
  yearframe$tdem0[i] <- mean(dat$tdem0[dat$year==yearframe$year[i]])
  yearframe$tdem1[i] <- mean(dat$tdem1[dat$year==yearframe$year[i]])
  yearframe$trep0[i] <- mean(dat$trep0[dat$year==yearframe$year[i]])
  yearframe$trep1[i] <- mean(dat$trep1[dat$year==yearframe$year[i]])
}
plot(yearframe$year,yearframe$tdem0,col="red",ylim=c(50,80))
points(yearframe$year,yearframe$tdem1,col="navy") # this is clear trend

plot(yearframe$year,yearframe$trep0,col="red",ylim=c(50,80))
points(yearframe$year,yearframe$trep1,col="navy") # this is ok, move on

yearframe$tdem0 <- NULL
yearframe$tdem1 <- NULL
yearframe$trep0 <- NULL
yearframe$trep1 <- NULL
yearframe$gap = c(yearframe$gap0[yearframe$year<1978], yearframe$gap1[yearframe$year>=1978])
yearframe$gap0 = NULL
yearframe$gap1 = NULL 

colnames(yearframe) = c("year","anesswing","anesswingstrict","anesgap")
head(yearframe)

nrow(df)
df <- join(df,yearframe,type="left",by=c("year"));nrow(df)



# CPI INDEX

dat = read.csv("CPI.csv",stringsAsFactors=F);head(dat)
colnames(dat) = c("date","cpi")
dat$ym = substr(dat$date,1,7)
dat$date = NULL
nrow(df)
df <- join(df,dat,type="left",by=c("ym"));nrow(df)

# UNEMPLOYMENT

dat = read.csv("UNRATE.csv",stringsAsFactors=F);head(dat)
colnames(dat) = c("date","unrate")
dat$ym = substr(dat$date,1,7)
dat$date = NULL
nrow(df)
df <- join(df,dat,type="left",by=c("ym"));nrow(df)

# S&P 500

dat = read.csv("SP500_monthly.csv",stringsAsFactors=F);head(dat)
colnames(dat) = c("date","sp")
dat$ym = substr(dat$date,1,7)
dat$sp=as.numeric(dat$sp);head(dat) 
dat$date = NULL
nrow(df)
df <- join(df,dat,type="left",by=c("ym"));nrow(df)


# AGGREGATE PRESIDENTIAL APPROVAL

a = read.csv("presidential_approval.csv",stringsAsFactors=F);head(a)
a$date = as.Date(a$end,format="%m/%d/%Y") # take end date of survey as date approval is measured
a$ym = substr(a$date,1,7) 
a = a[c("ym","approve")] 
nrow(df)
# take average of polls each month
df$approve = NA
for(i in 1:nrow(df)){
  if(df$ym[i] %in% a$ym){
    df$approve[i] = mean(a$approve[df$ym[i]==a$ym],na.rm=T)
    #print(a$approve[df$ym[i]==a$ym])
  }
}

# PARTY (0 = R, 1 = D)

df$party = NA
df$party[df$administration=="Truman"] = 1
df$party[df$administration=="Eisenhower"] = 0
df$party[df$administration=="Kennedy"] = 1
df$party[df$administration=="Johnson"] = 1
df$party[df$administration=="Nixon"] = 0
df$party[df$administration=="Ford"] = 0
df$party[df$administration=="Carter"] = 1
df$party[df$administration=="Reagan"] = 0
df$party[df$administration=="Bush Sr"] = 0
df$party[df$administration=="Clinton"] = 1
df$party[df$administration=="Bush Jr"] = 0
df$party[df$administration=="Obama"] = 1

df$dem = 0
df$dem[df$party==1] = 1

df$rep = 0
df$rep[df$party==0] = 1

# SECOND TERM

df$secondterm = 0
df$secondterm[df$year<1945] = NA # 
df$secondterm[df$year>=1949 & df$year<1953] = 1
df$secondterm[df$year>=1957 & df$year<1961] = 1
df$secondterm[df$year>=1973 & df$year<1975] = 1
df$secondterm[df$year>=1985 & df$year<1989] = 1
df$secondterm[df$year>=1997 & df$year<2001] = 1
df$secondterm[df$year>=2005 & df$year<2009] = 1
df$secondterm[df$year>=2009] = 1

# ELECTION YEAR
df$election = 0
df$election[df$year<1945] = NA
electionyears = c(1948,1952,1956,1960,1964,1968,1972,1976,1980,1984,1988,1992,1996,2000,2004,2008,2012)
df$election[df$year %in% electionyears] = 1

# COLD WAR
df$coldwar = 0
df$coldwar[df$year<1945] = NA
df$coldwar[df$year %in% c(1945:1991)] = 1

# 9/11
df$neleven = 0
df$neleven[df$year>2001] = 1
df$neleven[(df$year==2001 & df$month %in% c("09","10","11","12"))] = 1

# DIVIDED GOVERNMENT (new members of congress take office in january)

df$united = 0
un = c(1945,1946,1949,1950,1951,1952,1953,1954,1961,1962,1963,1964,1965,1966,1967,1968,1977,1978,1979,1980,1993,1994,2003,2004,2005,2006,2009,2010)
df$united[df$year %in% un] = 1
# recode Jim Jeffords May 24, 2001 joining Dem caucus; united in 2001 before this
df$united[(df$year==2001 & df$month %in% c("01","02","03","04","05"))] = 1

df$repcontrol = 0
rc = c(1953,1954,2003,2004,2005,2006)
df$repcontrol[df$year %in% rc] = 1
df$repcontrol[(df$year==2001 & df$month %in% c("01","02","03","04","05"))] = 1

df$demcontrol = 0
df$demcontrol[(df$united==1 & df$repcontrol==0)] = 1

# COLLAPSE TO MONTH

df = ddply(df, .(ym), plyr::summarize, 
           year = unique(year,na.rm=T),
           month = unique(month,na.rm=T),
           administration = unique(administration,na.rm=T),
           
           scale = mean(scale,na.rm=T),
           vcoop = sum(vcoop,na.rm=T),
           mcoop = sum(mcoop,na.rm=T),
           vcon = sum(vcon,na.rm=T),
           mcon = sum(mcon,na.rm=T),
           threat = sum(threat,na.rm=T),
           vconeconomic = sum(vconeconomic,na.rm=T),
           mconeconomic = sum(mconeconomic,na.rm=T),
           vconAlly = sum(vconAlly,na.rm=T),
           
           vconCore = sum(vconCore,na.rm=T),
           vconGreat = sum(vconGreat,na.rm=T),
           vconEnemy = sum(vconEnemy,na.rm=T),
           vcoopAlly = sum(vcoopAlly,na.rm=T),
           vcoopCore = sum(vcoopCore,na.rm=T),
           vcoopGreat = sum(vcoopGreat,na.rm=T),
           vcoopEnemy = sum(vcoopEnemy,na.rm=T),
           mconAlly = sum(mconAlly,na.rm=T),
           mconCore = sum(mconCore,na.rm=T),
           mconGreat = sum(mconGreat,na.rm=T),
           mconEnemy = sum(mconEnemy,na.rm=T),
           mcoopAlly = sum(mcoopAlly,na.rm=T),
           mcoopCore = sum(mcoopCore,na.rm=T),
           mcoopGreat = sum(mcoopGreat,na.rm=T),
           mcoopEnemy = sum(mcoopEnemy,na.rm=T),
           vconPol = sum(vconPol,na.rm=T),
           vconEcon = sum(vconEcon,na.rm=T),
           mconPol = sum(mconPol,na.rm=T),
           mconEcon = sum(mconEcon,na.rm=T),
           vcoopPol = sum(vcoopPol,na.rm=T),
           vcoopEcon = sum(vcoopEcon,na.rm=T),
           mcoopPol = sum(mcoopPol,na.rm=T),
           mcoopEcon = sum(mcoopEcon,na.rm=T),
           
           demApprove = unique(demApprove,na.rm=T),
           repApprove = unique(repApprove,na.rm=T),
           indApprove = unique(indApprove,na.rm=T),
           anesswing = unique(anesswing,na.rm=T),
           anesswingstrict = unique(anesswingstrict,na.rm=T),
           anesgap = unique(anesgap,na.rm=T),
           cpi = unique(cpi,na.rm=T),
           unrate = unique(unrate,na.rm=T),
           sp = unique(sp,na.rm=T),
           approve = unique(approve,na.rm=T),
           party = unique(party,na.rm=T),
           dem = unique(dem,na.rm=T),
           rep = unique(rep,na.rm=T),
           secondterm = unique(secondterm,na.rm=T),
           election = unique(election,na.rm=T),
           coldwar = unique(coldwar,na.rm=T),
           neleven = unique(neleven,na.rm=T),
           united = unique(united,na.rm=T),
           repcontrol = unique(repcontrol,na.rm=T),
           demcontrol = unique(demcontrol,na.rm=T)
           
           )

# GDP GROWTH (quarters to months)

dat = read.csv("GDP.csv",stringsAsFactors=F);head(dat)
colnames(dat) = c("date","gdpg")
dat$ym = substr(dat$date,1,7)
dat$gdpg=as.numeric(dat$gdpg)
dat$date = NULL
df$gdpg = NA
for(i in 1:(nrow(df)-2)){
  if(df$ym[i] %in% dat$ym){
    df$gdpg[i:(i+2)] <- dat$gdpg[df$ym[i]==dat$ym]
  }
}

# create delta vars

df$DdemApprove = c(NA,diff(df$demApprove))
df$DrepApprove = c(NA,diff(df$repApprove))
df$DindApprove = c(NA,diff(df$indApprove))


# lag vars of interest 
myvars = names(df)[5:length(names(df))]
for(var in myvars){
  print(var)
  lagvar = paste("L",var,sep="")
  df[lagvar] = NA
  for(i in 2:nrow(df)){
    df[i,lagvar] = df[(i-1),var] 
  }
}

# MISERY INDEX
summary(df$cpi) # index
df$inflation = df$cpi - df$Lcpi
df$misery = df$unrate + df$inflation

# lag misery
df$Lmisery = NA
  for(i in 2:nrow(df)){
    df[i,"Lmisery"] = df[(i-1),"misery"] 
  }


################################################################

# save 
nrow(df) # 781
write.csv(df,'FINALDATA.csv')

################################################################



